# Peri 12/3/24 adopted and re-written using biopython
# inspired by the excellent work of Aravind CR (neptune.ai MLOps blog)
!pip install biopython
!pip install -U spacy
!pip install tqdm
Requirement already satisfied: biopython in c:\programdata\anaconda3\lib\site-packages (1.83) Requirement already satisfied: numpy in c:\programdata\anaconda3\lib\site-packages (from biopython) (1.23.5)
WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages)
Requirement already satisfied: spacy in c:\programdata\anaconda3\lib\site-packages (3.7.4) Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in c:\programdata\anaconda3\lib\site-packages (from spacy) (2.6.1) Requirement already satisfied: typer<0.10.0,>=0.3.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (0.9.0) Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (3.3.0) Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (2.28.1) Requirement already satisfied: thinc<8.3.0,>=8.2.2 in c:\programdata\anaconda3\lib\site-packages (from spacy) (8.2.3) Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (1.0.10) Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\programdata\anaconda3\lib\site-packages (from spacy) (2.0.8) Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in c:\programdata\anaconda3\lib\site-packages (from spacy) (2.0.10) Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in c:\programdata\anaconda3\lib\site-packages (from spacy) (1.1.2) Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from spacy) (3.0.9) Requirement already satisfied: numpy>=1.19.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (1.23.5) Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (22.0) Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in c:\programdata\anaconda3\lib\site-packages (from spacy) (5.2.1) Requirement already satisfied: weasel<0.4.0,>=0.1.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (0.3.4) Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in c:\programdata\anaconda3\lib\site-packages (from spacy) (3.0.12) Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in 
c:\programdata\anaconda3\lib\site-packages (from spacy) (1.0.5) Requirement already satisfied: setuptools in c:\programdata\anaconda3\lib\site-packages (from spacy) (65.6.3) Requirement already satisfied: srsly<3.0.0,>=2.4.3 in c:\programdata\anaconda3\lib\site-packages (from spacy) (2.4.8) Requirement already satisfied: jinja2 in c:\programdata\anaconda3\lib\site-packages (from spacy) (3.1.2) Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\programdata\anaconda3\lib\site-packages (from spacy) (4.64.1) Requirement already satisfied: typing-extensions>=4.6.1 in c:\programdata\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.9.0) Requirement already satisfied: annotated-types>=0.4.0 in c:\programdata\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0) Requirement already satisfied: pydantic-core==2.16.2 in c:\programdata\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.16.2) Requirement already satisfied: charset-normalizer<3,>=2 in c:\programdata\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.4) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14) Requirement already satisfied: idna<4,>=2.5 in c:\programdata\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4) Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.12.7) Requirement already satisfied: blis<0.8.0,>=0.7.8 in c:\programdata\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11) Requirement already satisfied: confection<1.0.0,>=0.0.1 in c:\programdata\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4) Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.38.0->spacy) (0.4.6) Requirement 
already satisfied: click<9.0.0,>=7.1.1 in c:\programdata\anaconda3\lib\site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.0.4) Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in c:\programdata\anaconda3\lib\site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\programdata\anaconda3\lib\site-packages (from jinja2->spacy) (2.1.5)
WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (4.64.1) Requirement already satisfied: colorama in c:\programdata\anaconda3\lib\site-packages (from tqdm) (0.4.6)
WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages)
from Bio import Entrez
import pandas as pd
import numpy as np
def search(query, email='periklis.kontoroupis@tno.nl', retmax='10000'):
    """Search PubMed for a query string and return the parsed Entrez results.

    Parameters
    ----------
    query : str
        The PubMed search term.
    email : str, optional
        Contact e-mail reported to NCBI (required by Entrez usage policy).
    retmax : str or int, optional
        Maximum number of record IDs to return.

    Returns
    -------
    Dict-like Entrez parse result; the matching PubMed IDs are under 'IdList'.
    """
    # NCBI requires a contact e-mail on every Entrez request.
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    try:
        # Parse the XML response into a Python structure.
        return Entrez.read(handle)
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()
# Run a PubMed search for perovskite solar cell literature.
studies = search('perovskite solar cells')
# 'IdList' holds the PubMed IDs (strings) of every matching record.
studiesIdList = studies['IdList']
def fetch_details(id_list, email='periklis.kontoroupis@tno.nl'):
    """Fetch full PubMed records for a list of article IDs.

    Parameters
    ----------
    id_list : sequence of str
        PubMed IDs to retrieve.
    email : str, optional
        Contact e-mail reported to NCBI (required by Entrez usage policy).

    Returns
    -------
    Dict-like Entrez parse result; the articles are under 'PubmedArticle'.
    """
    # efetch accepts a single comma-separated string of IDs.
    ids = ','.join(id_list)
    # NCBI requires a contact e-mail on every Entrez request.
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even if parsing raises.
        handle.close()
# Accumulators for the per-article fields pulled out of each PubMed record.
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []
# Fetch the records in batches: efetch calls with very long ID lists are slow
# and can be rejected, so the ID list is processed chunk by chunk.
# NOTE: the original code also downloaded ALL records up front in one extra
# fetch_details(studiesIdList) call and discarded the result; that redundant
# full download is removed here.
chunk_size = 10000
for chunk_i in range(0, len(studiesIdList), chunk_size):
    # IDs for the current batch.
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)
    for paper in papers['PubmedArticle']:
        # All the fields we need hang off the Article element.
        article = paper['MedlineCitation']['Article']
        title_list.append(article['ArticleTitle'])
        # Some records carry no abstract; record a placeholder instead.
        try:
            abstract_list.append(article['Abstract']['AbstractText'][0])
        except (KeyError, IndexError):
            abstract_list.append('No Abstract')
        journal_list.append(article['Journal']['Title'])
        # 'Language' is a list; keep the primary language only.
        language_list.append(article['Language'][0])
        # Year/Month can be absent (e.g. records with only a MedlineDate).
        pub_date = article['Journal']['JournalIssue']['PubDate']
        try:
            pubdate_year_list.append(pub_date['Year'])
        except KeyError:
            pubdate_year_list.append('No Data')
        try:
            pubdate_month_list.append(pub_date['Month'])
        except KeyError:
            pubdate_month_list.append('No Data')
# Assemble one row per article.
df = pd.DataFrame(list(zip(
    title_list, abstract_list, journal_list, language_list, pubdate_year_list, pubdate_month_list
)),
    columns=[
        'Title', 'Abstract', 'Journal', 'Language', 'Year', 'Month'
    ])
df
| Title | Abstract | Journal | Language | Year | Month | |
|---|---|---|---|---|---|---|
| 0 | Stable Tin-Based Perovskite Solar Cells. | The developments in halide perovskite research... | ACS energy letters | eng | 2023 | Apr |
| 1 | Nanostructured Perovskite Solar Cells. | Over the past decade, lead halide perovskites ... | Nanomaterials (Basel, Switzerland) | eng | 2019 | Oct |
| 2 | Interfacial modification in perovskite-based t... | With photovoltaic performance of metal halide ... | Nano convergence | eng | 2023 | May |
| 3 | All-Inorganic Perovskite Solar Cells: Recent A... | Organic-inorganic metal-halide-based hybrid pe... | Nanomaterials (Basel, Switzerland) | eng | 2022 | May |
| 4 | Perovskite solar cells: must lead be replaced ... | Perovskite solar cells have recently drawn sig... | Science and technology of advanced materials | eng | 2018 | No Data |
| ... | ... | ... | ... | ... | ... | ... |
| 7327 | [Ba<sub>4</sub>Cl] Cations Directed Perovskite... | Novel 3D metal formate frameworks {[Ba<sub>4</... | Inorganic chemistry | eng | 2022 | Jan |
| 7328 | A-site deficient semiconductor electrolyte Sr<... | Fast ionic conduction at low operating tempera... | RSC advances | eng | 2022 | Aug |
| 7329 | Synthesis and Characterization of New Conjugat... | A new series of thiophene-based azomethines di... | International journal of molecular sciences | eng | 2022 | Jul |
| 7330 | Band Structure Engineering of Interfacial Semi... | To explore new constituents in two-dimensional... | Advanced materials (Deerfield Beach, Fla.) | eng | 2019 | Apr |
| 7331 | Visualizing the Anomalous Charge Density Wave ... | Metallic layered transition metal dichalcogeni... | Advanced materials (Deerfield Beach, Fla.) | eng | 2020 | Nov |
7332 rows × 6 columns
# Load spaCy's small English pipeline for POS/dependency parsing.
# (The original cell imported spacy twice; the duplicate import is dropped.)
import spacy

nlp = spacy.load("en_core_web_sm")
def extract_entities(sents):
    """Extract a (subject, object) entity pair from a piece of text.

    Walks the dependency parse once, accumulating compound-word prefixes and
    modifiers, and returns the last subject-like and object-like tokens seen,
    each decorated with its collected modifier and prefix.

    Parameters
    ----------
    sents : str
        The text to parse (typically one sentence or a short abstract).

    Returns
    -------
    list[str]
        [subject_entity, object_entity]; either may be '' if none was found.
    """
    enti_one = ""          # subject-side entity
    enti_two = ""          # object-side entity
    dep_prev_token = ""    # dependency tag of the previous token
    txt_prev_token = ""    # text of the previous token
    prefix = ""            # accumulated compound-word prefix
    modifier = ""          # accumulated modifier

    for tokn in nlp(sents):
        # Skip punctuation tokens entirely.
        if tokn.dep_ == "punct":
            continue
        # Track compound words, joining consecutive compounds together.
        if tokn.dep_ == "compound":
            prefix = tokn.text
            if dep_prev_token == "compound":
                prefix = txt_prev_token + " " + tokn.text
        # Track modifiers (amod, nmod, advmod, ...), joining after compounds.
        if tokn.dep_.endswith("mod"):
            modifier = tokn.text
            if dep_prev_token == "compound":
                modifier = txt_prev_token + " " + tokn.text
        # Subject token: assemble entity one and reset the accumulators.
        # BUG FIX: the original tested `dep_.find("subj") == True`, which is
        # True only when "subj" starts at index 1 (e.g. "nsubj") and misses
        # other subject tags; use substring membership instead.
        if "subj" in tokn.dep_:
            enti_one = modifier + " " + prefix + " " + tokn.text
            prefix = ""
            modifier = ""
            dep_prev_token = ""
            txt_prev_token = ""
        # Object token: assemble entity two (same fix as above).
        if "obj" in tokn.dep_:
            enti_two = modifier + " " + prefix + " " + tokn.text
        # Remember this token for the next iteration.
        dep_prev_token = tokn.dep_
        txt_prev_token = tokn.text

    return [enti_one.strip(), enti_two.strip()]
# Demonstrate entity extraction on a sample passage taken from the perovskites report.
extract_entities("considered the exposure scenario of halide solar cell panels in operation, failing due to harsh environmental conditions. This may create a lead leakage in the nearby water bodies, an ecotoxicology problem. Effectively, the relatively high-water solubility of lead halide salts comprising perovskite may decompose to PbI2, hydroiodic acid, and methylamine. The degradation products could be evaluated with regard to ecotoxicity (e.g. Zebrafsh, Daphnia and C. elegans media)")
['hydroiodic degradation products', 'ecotoxicity']
from tqdm import tqdm

# Extract a (subject, object) pair from each of the first 800 abstracts,
# showing a progress bar while iterating.
pairs_of_entities = [
    extract_entities(str(abstract))
    for abstract in tqdm(df['Abstract'][:800])
]
100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:28<00:00, 28.04it/s]
# Inspect a slice of the extracted entity pairs (items 10 through 39).
pairs_of_entities[10:40]
[['Furthermore Sn we', 'mixed Sn perovskites'], ['aromatic design that', 'efficient 2D PSCs'], ['detailed loss guidelines', 'solar performance cells'], ['inorganic perovskite outlook', 'inorganic interface view'], ['how degree caesium', 'strongly perovskites'], ['also it', 'important charge conduction'], ['free perovskites', 'solar cells'], ['stabilizing methods', 'when start future'], ['Finally market we', 'high efficiency PSCs'], ['relevant issues', 'periodic metal elements'], ['Lastly instability challenges', 'containing PSCs'], ['high power conversion attributes', 'renewable energy sources'], ['perovskite film methodologies', 'various solution processes'], ['evolutionary characterization trajectory', 'solar research cells'], ['photonic chemical We', 'solar performance cells'], ['selective ion migration', 'closely voltage stability'], ['carefully which', 'efficient tandem application'], ['future metal halide perspectives', 'inorganic metal halide materials'], ['Mixed Sn halide perovskites', 'solar cells'], ['specifically Mitigation strategies', 'residual perovskite films'], ['informative study', 'atmospheric conditions'], ['solar research cells', 'layered research halide'], ['Collectively bandgap strategies', '23 power conversion %'], ['future research review', 'other PSCs'], ['solar development', 'solar cell technologies'], ['comprehensive improvement understanding', 'based end'], ['rational design', 'unprecedented stability'], ['optical Passivation strategies', 'based LEDs'], ['different PSC device Review', 'large scale commercialization'], ['Finally Si we', 'thorough silicon future']]
def obtain_relation(sent):
    """Extract relation phrases (the ROOT verbs) linking entities in `sent`.

    Matches the pattern ROOT (+ optional preposition, agent, adjective) and
    returns the text of the first token of every match.

    Parameters
    ----------
    sent : str
        Sentence/abstract text to parse.

    Returns
    -------
    list[str]
        One relation word per pattern match.
    """
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    # ROOT verb, optionally followed by a preposition, agent and adjective.
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    # BUG FIX: the original iterated range(len(matches) - 1), silently
    # dropping the last match, and shadowed `matcher` with the match list.
    relation = [doc[start].text for _, start, _ in matches]
    return relation
from spacy.matcher import Matcher

# Pull the relation words out of each of the first 800 abstracts.
relations = []
for abstract in tqdm(df['Abstract'][:800]):
    relations.append(obtain_relation(str(abstract)))
100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:24<00:00, 33.32it/s]
# Split each extracted (subject, object) pair into source/target columns.
source = []
target = []
for pair in pairs_of_entities:
    source.append(pair[0])   # subject side
    target.append(pair[1])   # object side

# One row per abstract: subject, object, and the list of relation words.
# Repeated words in the edge lists are kept as-is (no frequency counting).
data_kgf = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
data_kgf
| source | target | edge | |
|---|---|---|---|
| 0 | current perovskite I | solar cells | [target, are, are, discuss, demonstrated, give] |
| 1 | we | dimensional quantum dots | [emerged, emerged, perovskite, perovskite, rem... |
| 2 | top we | high field | [exploring, exploring, combined, combined, nee... |
| 3 | charge selective charge recap | previously carrier recombination losses | [attracted, suffer, emerged, published, climbe... |
| 4 | open chemical questions | low development toxicity | [drawn, drawn, presents, outlines, discussed, ... |
| ... | ... | ... | ... |
| 795 | theoretical charge model | solar cells applications | [revolutionized, dominated, devoted, shown, in... |
| 796 | electron transport we | working carrier transport devices | [is, is, is, examine, find] |
| 797 | comprehensive review | engineering PSCs | [become, restrict, needed, are, specified, pro... |
| 798 | more lattice findings | solar cells | [exhibit, exhibit, encountered, are, employed,... |
| 799 | enhanced opencircuit performance | 6.03 power conversion % | [attracted, attracted, cause, cause, is, is, r... |
800 rows × 3 columns
import matplotlib.pyplot as plt
import networkx as nx

# Build a directed multigraph straight from the knowledge-graph dataframe.
graph = nx.from_pandas_edgelist(
    data_kgf, "source", "target",
    edge_attr=True, create_using=nx.MultiDiGraph()
)

# Draw the whole network with a force-directed (spring) layout.
fig, ax = plt.subplots(figsize=(14, 14))  # subplots gives us the Axes object
posn = nx.spring_layout(graph)
nx.draw(graph, pos=posn, ax=ax,
        with_labels=True, node_color='green', edge_cmap=plt.cm.Blues)
plt.show()
# the above picture is of no use; let's try to make it presentable
!pip install plotly
import plotly.graph_objects as go
import networkx as nx

# Build the directed multigraph and compute a force-directed layout once.
graph = nx.from_pandas_edgelist(data_kgf, "source", "target",
                                edge_attr=True, create_using=nx.MultiDiGraph())
posn = nx.spring_layout(graph)

# Edge coordinates: every edge contributes (x0, x1, None) / (y0, y1, None);
# the None entries break the line between consecutive edges.
edge_x = []
edge_y = []
for src, dst in graph.edges():
    x0, y0 = posn[src]
    x1, y1 = posn[dst]
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))

# Node coordinates and hover labels.
node_x = [posn[node][0] for node in graph.nodes()]
node_y = [posn[node][1] for node in graph.nodes()]
node_text = list(graph.nodes())

fig = go.Figure()
# Edges drawn as a single light-grey line trace.
fig.add_trace(go.Scatter(
    x=edge_x, y=edge_y,
    mode='lines',
    line=dict(color='rgb(210,210,210)', width=1),
    hoverinfo='none'
))
# Nodes drawn as green markers; the node name appears on hover.
fig.add_trace(go.Scatter(
    x=node_x, y=node_y,
    mode='markers',
    marker=dict(symbol='circle-dot',
                size=10,
                color='green',
                line=dict(color='rgb(50,50,50)', width=0.5)),
    hoverinfo='text',
    text=node_text
))
# Hide the axes/grid and centre the title.
fig.update_layout(
    title='Network Graph',
    title_x=0.5,
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)
fig.show()
Requirement already satisfied: plotly in c:\programdata\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from plotly) (8.0.1)
WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages) WARNING: Ignoring invalid distribution -ygments (c:\programdata\anaconda3\lib\site-packages)
import matplotlib.pyplot as plt
import networkx as nx

# Restrict the graph to the first 20 rows so the drawing stays readable.
selected_data_kgf = data_kgf.head(20)
graph = nx.from_pandas_edgelist(
    selected_data_kgf, "source", "target",
    edge_attr=True, create_using=nx.MultiDiGraph()
)

# Draw the reduced network with a spring layout.
fig, ax = plt.subplots(figsize=(14, 14))  # subplots gives us the Axes object
posn = nx.spring_layout(graph)
nx.draw(graph, pos=posn, ax=ax,
        with_labels=True, node_color='green', edge_cmap=plt.cm.Blues)
plt.show()
import networkx as ntx
import matplotlib.pyplot as plot

# BUG FIX: data_kgf['edge'] holds LISTS of relation words, so the original
# equality test `data_kgf['edge'] == "Information from"` was always False and
# always produced an empty graph. Select rows whose relation list CONTAINS
# the phrase instead.
mask = data_kgf['edge'].apply(lambda rels: "Information from" in rels)
graph = ntx.from_pandas_edgelist(data_kgf[mask], "source", "target",
                                 edge_attr=True, create_using=ntx.MultiDiGraph())
fig, ax = plot.subplots(figsize=(14, 14))
pos = ntx.spring_layout(graph, k=0.5)  # k regulates the distance between nodes
ntx.draw(graph, with_labels=True, node_color='green', node_size=1400,
         edge_cmap=plot.cm.Blues, pos=pos, ax=ax)
plot.show()
['node1', 'node2', 'node3']